tutorials/005 - Glue Catalog.ipynb (711 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 5 - Glue Catalog\n",
"\n",
"[awswrangler](https://github.com/aws/aws-sdk-pandas) makes heavy use of [Glue Catalog](https://aws.amazon.com/glue/) to store metadata of tables and connections."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import awswrangler as wr"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ············\n"
]
}
],
"source": [
"import getpass\n",
"\n",
"bucket = getpass.getpass()\n",
"path = f\"s3://{bucket}/data/\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a Pandas DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>price</th>\n",
" <th>in_stock</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>shoes</td>\n",
" <td>50.3</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>tshirt</td>\n",
" <td>10.5</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>ball</td>\n",
" <td>20.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name price in_stock\n",
"0 1 shoes 50.3 True\n",
"1 2 tshirt 10.5 True\n",
"2 3 ball 20.0 False"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\"id\": [1, 2, 3], \"name\": [\"shoes\", \"tshirt\", \"ball\"], \"price\": [50.3, 10.5, 20.0], \"in_stock\": [True, True, False]}\n",
")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking Glue Catalog Databases"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Database Description\n",
"0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n",
"1 default Default Hive database\n"
]
}
],
"source": [
"databases = wr.catalog.databases()\n",
"print(databases)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the database awswrangler_test if not exists"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Database Description\n",
"0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n",
"1 awswrangler_test \n",
"2 default Default Hive database\n"
]
}
],
"source": [
"if \"awswrangler_test\" not in databases.values:\n",
" wr.catalog.create_database(\"awswrangler_test\")\n",
" print(wr.catalog.databases())\n",
"else:\n",
" print(\"Database awswrangler_test already exists\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking the empty database"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Database</th>\n",
" <th>Table</th>\n",
" <th>Description</th>\n",
" <th>Columns</th>\n",
" <th>Partitions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Database, Table, Description, Columns, Partitions]\n",
"Index: []"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(database=\"awswrangler_test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Writing DataFrames to Data Lake (S3 + Parquet + Glue Catalog)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"desc = \"This is my product table.\"\n",
"\n",
"param = {\"source\": \"Product Web Service\", \"class\": \"e-commerce\"}\n",
"\n",
"comments = {\n",
" \"id\": \"Unique product ID.\",\n",
" \"name\": \"Product name\",\n",
" \"price\": \"Product price (dollar)\",\n",
" \"in_stock\": \"Is this product availaible in the stock?\",\n",
"}\n",
"\n",
"res = wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/products/\",\n",
" dataset=True,\n",
" database=\"awswrangler_test\",\n",
" table=\"products\",\n",
" mode=\"overwrite\",\n",
" glue_table_settings=wr.typing.GlueTableSettings(description=desc, parameters=param, columns_comments=comments),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checking Glue Catalog (AWS Console)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Looking Up for the new table!"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Database</th>\n",
" <th>Table</th>\n",
" <th>Description</th>\n",
" <th>Columns</th>\n",
" <th>Partitions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>awswrangler_test</td>\n",
" <td>products</td>\n",
" <td>This is my product table.</td>\n",
" <td>id, name, price, in_stock</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_contains=\"roduc\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Database</th>\n",
" <th>Table</th>\n",
" <th>Description</th>\n",
" <th>Columns</th>\n",
" <th>Partitions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>awswrangler_test</td>\n",
" <td>products</td>\n",
" <td>This is my product table.</td>\n",
" <td>id, name, price, in_stock</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_prefix=\"pro\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Database</th>\n",
" <th>Table</th>\n",
" <th>Description</th>\n",
" <th>Columns</th>\n",
" <th>Partitions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>awswrangler_test</td>\n",
" <td>products</td>\n",
" <td>This is my product table.</td>\n",
" <td>id, name, price, in_stock</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_suffix=\"ts\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Database</th>\n",
" <th>Table</th>\n",
" <th>Description</th>\n",
" <th>Columns</th>\n",
" <th>Partitions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>awswrangler_test</td>\n",
" <td>products</td>\n",
" <td>This is my product table.</td>\n",
" <td>id, name, price, in_stock</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(search_text=\"This is my\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Getting tables details"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column Name</th>\n",
" <th>Type</th>\n",
" <th>Partition</th>\n",
" <th>Comment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>id</td>\n",
" <td>bigint</td>\n",
" <td>False</td>\n",
" <td>Unique product ID.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>name</td>\n",
" <td>string</td>\n",
" <td>False</td>\n",
" <td>Product name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>price</td>\n",
" <td>double</td>\n",
" <td>False</td>\n",
" <td>Product price (dollar)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>in_stock</td>\n",
" <td>boolean</td>\n",
" <td>False</td>\n",
" <td>Is this product availaible in the stock?</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column Name Type Partition Comment\n",
"0 id bigint False Unique product ID.\n",
"1 name string False Product name\n",
"2 price double False Product price (dollar)\n",
"3 in_stock boolean False Is this product availaible in the stock?"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.table(database=\"awswrangler_test\", table=\"products\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up the Database"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n",
" wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete Database"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"wr.catalog.delete_database(\"awswrangler_test\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13 (main, Aug 2 2022, 15:07:42) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
},
"vscode": {
"interpreter": {
"hash": "bd595004b250e5f4145a0d632609b0d8f97d1ccd278d58fafd6840c0467021f9"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}